In this notebook we'll be looking at the results of the cash controller
that is trained with an entropy_coefficient, which encourages the
network to propose a diverse set of ML frameworks instead of quickly
converging to a single one.
The results we'll be looking at are from the following floydhub jobs:
# Notebook setup: auto-reload edited project modules so changes are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns
from plotly import subplots
from plotly.offline import iplot, init_notebook_mode
# connected=False embeds plotly.js in the notebook instead of loading from CDN
init_notebook_mode(connected=False)
sns.set_style("whitegrid")
%matplotlib inline
# entropy_coef setting used by each floydhub job, keyed by job number.
# (transcribed by hand from the floydhub job pages linked above.)
JOB_ENTROPY_COEF_MAP = {
    168: 0.1,
    169: 1.0,
    170: 10,
    171: 100,
    172: 0.2,
    173: 0.4,
    174: 0.6,
    175: 0.8,
}

# view over the job numbers; iterated when loading the per-job results below
JOB_NUMS = JOB_ENTROPY_COEF_MAP.keys()
def _load_job_results(job_num):
    """Read one job's experiment csv and tag each row with job metadata.

    Adds `job_number`, its `entropy_coef` setting, and a "<job>-<trial>"
    `job_trial_id` used as the grouping key throughout this notebook.
    """
    csv_path = (
        "../floyd_outputs/%d/rnn_metalearn_controller_experiment.csv" % job_num)
    return (
        pd.read_csv(csv_path)
        .assign(job_number=job_num)
        .assign(
            entropy_coef=lambda d: d.job_number.map(JOB_ENTROPY_COEF_MAP),
            job_trial_id=lambda d: d.job_number.astype(str).str.cat(
                d.trial_number.astype(str), sep="-"))
    )


results = pd.concat([_load_job_results(job_num) for job_num in JOB_NUMS])
results.head()
Compute the exponentially-weighted mean of the metrics for each job.
# unique "<job>-<trial>" ids across all loaded runs
JOB_TRIAL_IDS = list(results.job_trial_id.unique())

# per-episode training metrics tracked for every controller run; the order
# here also fixes the subplot layout order in the figures below.
METRICS = [
    "losses",
    "aggregate_gradients",
    "best_validation_scores",
    "mean_rewards",
    "mean_validation_scores",
    "n_successful_mlfs",
    "mlf_diversity",
    "hyperparam_diversity",
]
# Smooth each metric per job trial with an exponentially-weighted moving
# average (alpha=0.05) so the per-episode noise doesn't dominate the plots.
_index_cols = ["episode", "entropy_coef", "job_trial_id"]
mean_results = (
    results
    .set_index(_index_cols)
    .groupby("job_trial_id")
    .apply(lambda group: group[METRICS].ewm(alpha=0.05).mean())
    .reset_index()
)
mean_results.head()
import colorlover
import math

# duplicates the earlier definition; kept so this cell can run standalone
JOB_TRIAL_IDS = list(results.job_trial_id.unique())

# qualitative 9-colour palette shared by all plots in this notebook
PALETTE = colorlover.scales["9"]["qual"]["Paired"]

# NOTE(review): every metric maps to the same palette slice (the last
# len(JOB_TRIAL_IDS) colours), and with more than 9 trial ids the negative
# slice silently returns the whole palette -- confirm this is intended.
METRIC_PALETTE_MAP = {
    m: PALETTE[-len(JOB_TRIAL_IDS):]
    for m in METRICS
}
def subplot_coords(iterable, ncols, one_indexed=True):
    """Map a collection of plots onto a grid with `ncols` columns.

    Returns a dict with "nrows", "ncols", and "coords": a row-major list of
    (row, col) pairs covering the full nrows x ncols grid. Coordinates are
    one-indexed by default, matching plotly's subplot convention.
    """
    n_plots = len(iterable)
    n_rows = math.ceil(n_plots / ncols)
    base = 1 if one_indexed else 0
    grid = []
    for r in range(n_rows):
        for c in range(ncols):
            grid.append((r + base, c + base))
    return {"nrows": n_rows, "ncols": ncols, "coords": grid}
def create_time_series(x, y, group_name, showlegend, group_colormap=None):
    """Build one line trace of metric `y` over episodes `x` for a group.

    If `group_colormap` is given, the line colour is looked up by
    `group_name`; otherwise plotly picks a default colour.
    """
    line_style = {"width": 1}
    if group_colormap is not None:
        line_style["color"] = group_colormap[group_name]
    return go.Scatter(
        x=x,
        y=y,
        name=group_name,
        legendgroup=group_name,
        mode='lines',
        line=line_style,
        showlegend=showlegend,
        opacity=0.7,
    )
def create_multi_time_series(
        results, group_column, metric, legend_metric="mlf_diversity"):
    """Create one line trace per unique value of `group_column` for `metric`.

    Only the subplot for `legend_metric` contributes legend entries so the
    legend is not repeated once per metric panel.
    """
    groups = results[group_column].unique()
    cm = {g: PALETTE[i] for i, g in enumerate(groups)}
    # idiom fix: the condition is already a boolean
    showlegend = metric == legend_metric
    return (
        results
        .groupby(group_column)
        .apply(lambda df: create_time_series(
            df["episode"],
            df[metric],
            df[group_column].iloc[0],
            showlegend,
            cm,
        ))
        .tolist())
# Lay out one subplot per metric, two per row, with every entropy_coef
# condition overlaid as a line in each panel.
coords = subplot_coords(METRICS, 2)
fig = subplots.make_subplots(
    rows=coords["nrows"],
    cols=coords["ncols"],
    subplot_titles=METRICS,
    vertical_spacing=0.1,
    print_grid=False)

n_panels = coords["ncols"] * coords["nrows"]
for idx, metric in enumerate(METRICS):
    row, col = coords["coords"][idx]
    for trace in create_multi_time_series(
            mean_results, "entropy_coef", metric):
        fig.append_trace(trace, row, col)
    # label the x-axis only on the bottom row of panels; plotly names the
    # first axis "xaxis" (no suffix) and the rest "xaxis2", "xaxis3", ...
    if idx >= n_panels - coords["ncols"]:
        axis_key = "xaxis%s" % ("" if idx == 0 else idx + 1)
        fig.layout[axis_key].update({"title": "episode"})
fig.layout.update({
    "height": 800,
});
The model fit metrics that we want to look at to assess the different
qualities of the controllers that we've trained under various entropy_coef
settings are the following:
losses and aggregate_gradients provide a general sense of the shape of
the objective function during the course of training. Roughly speaking,
a negative loss indicates that the controller's observed reward is worse
than its expected reward, and a positive loss indicates the converse.

best_validation_scores, mean_validation_scores, and mean_rewards
are different ways of looking at the validation performance of the
controller's proposed ML frameworks.

n_successful_mlfs is a "nice-to-know" metric to indicate the number
of successful ML frameworks per episode (i.e. those that did not produce
a fit or scoring error during MLF fitting/evaluation). This should really
be normalized as a percentage of iterations (MLF proposals) per episode.

mlf_diversity indicates the diversity of MLFs proposed by the controller
per episode: 1.0 indicates all proposals are unique MLFs, while 0.0 indicates
that all proposed MLFs are the same.

hyperparam_diversity is similar to mlf_diversity in meaning but measures
the diversity in hyperparameter settings.

iplot(fig)
entropy_coef = 0.1 (light blue) yields the highest mean_rewards,
but note that the mlf_diversity and hyperparam_diversity indicate that the
controller converges to the same ML framework (with varying hyperparameter settings)
at around 200 episodes.

Setting entropy_coef to a larger and larger value results in controllers that
end up exploring for the duration of training and do not converge to a smaller set of
ML frameworks.

Given the algorithm_components available to the controller as of
git commit 761b9cf, entropy_coef=0.2 seems to be the "goldilocks" setting, in which
mean_rewards increases to ~70% while still proposing a diverse set of MLFs.

The best_validation_scores achieved by higher entropy_coef
controllers are still fairly high (~90% f1_scores) in the case of entropy_coef=0.4.

# TODO: compute the exponential moving average of metrics per dataset
# Same exponentially-weighted smoothing as `mean_results`, but grouped per
# (job trial, dataset) pair so we can compare performance per data env.
_group_cols = ["job_trial_id", "data_env_names"]
mean_results_by_data = (
    results
    .set_index(["episode", "entropy_coef"] + _group_cols)
    .groupby(_group_cols)
    .apply(lambda group: group[METRICS].ewm(alpha=0.05).mean())
    .reset_index()
)
mean_results_by_data.head()
from collections import defaultdict

# one colour per distinct entropy_coef value, in order of first appearance
_coefs = mean_results_by_data.entropy_coef.unique()
COLORMAP = {coef: PALETTE[idx] for idx, coef in enumerate(_coefs)}
def time_series(df, legend_metric="anneal"):
    """Build a mean_rewards line trace for one (job trial, dataset) group.

    Parameters
    ----------
    df : sub-frame for a single (job_trial_id, data_env_names) group; must
        contain "entropy_coef", "data_env_names", "episode", "mean_rewards".
    legend_metric : name of the data env whose subplot carries the legend,
        so each entropy_coef appears in the legend exactly once.
    """
    entropy_coef = df["entropy_coef"].iloc[0]
    env_name = df["data_env_names"].iloc[0]

    line_dict = dict(width=1)
    color = COLORMAP.get(entropy_coef)
    if color is not None:
        line_dict.update(dict(color=color))

    # idiom fix: the comparison is already a boolean; also removed the
    # unused `entropy_coef_display` local.
    showlegend = env_name == legend_metric
    return go.Scatter(
        x=df["episode"],
        y=df['mean_rewards'],
        name=entropy_coef,
        legendgroup=entropy_coef,
        line=line_dict,
        mode='lines',
        showlegend=showlegend,
    )
# Build time_series_data: {data_env_name: {job_trial_id: trace}} so traces
# can be placed on the per-dataset subplot grid below.
_grouped_traces = (
    mean_results_by_data
    .groupby(["data_env_names", "job_trial_id"])
    .apply(time_series)
    .to_dict()
)
time_series_data = defaultdict(dict)
for (env_name, trial_id), trace in _grouped_traces.items():
    time_series_data[env_name][trial_id] = trace
# One subplot per dataset (three per row), each overlaying the smoothed
# mean_rewards curve of every entropy_coef condition.
coords = subplot_coords(time_series_data, 3)
fig = subplots.make_subplots(
    rows=coords["nrows"],
    cols=coords["ncols"],
    subplot_titles=list(time_series_data.keys()),
    vertical_spacing=0.1,
    print_grid=False)

n_panels = coords["ncols"] * coords["nrows"]
for idx, (data_env, traces) in enumerate(time_series_data.items()):
    row, col = coords["coords"][idx]
    for trace in traces.values():
        fig.append_trace(trace, row, col)
    # label the x-axis only on the bottom row of panels; plotly names the
    # first axis "xaxis" (no suffix) and the rest "xaxis2", "xaxis3", ...
    if idx >= n_panels - coords["ncols"]:
        axis_key = "xaxis%s" % ("" if idx == 0 else idx + 1)
        fig.layout[axis_key].update({"title": "episode"})
fig.layout.update({
    "height": 800,
});
iplot(fig)
The same general pattern can be seen here with the per-dataset mean_rewards for each
controller trained under different entropy_coef settings, as in the above notes
section.
# analyze the best MLF pipelines each job wrote to
# ../floyd_outputs/<job_id>/metalearn_controller_mlfs_trial_*/
import joblib
import re
from pathlib2 import Path
from metalearn import utils

# matches e.g. "best_mlf_episode_42.pkl". Fix: raw string so "\d" is not an
# invalid escape (SyntaxWarning on modern Python), and escape the "." which
# previously matched any character.
BEST_MLF_FILENAME_PATTERN = re.compile(r"best_mlf_episode_(\d+)\.pkl")

best_mlfs = []
for job_id in JOB_NUMS:
    job_output_fp = Path("../floyd_outputs/%s" % job_id)
    for fp in job_output_fp.glob("metalearn_controller_mlfs_trial_*/*.pkl"):
        mlf = joblib.load(fp)
        episode = int(BEST_MLF_FILENAME_PATTERN.match(fp.name).group(1))
        # a pickled None means the episode produced no successful MLF
        mlf_str = "NONE" if mlf is None else utils._ml_framework_string(mlf)
        best_mlfs.append(
            [job_id, JOB_ENTROPY_COEF_MAP.get(job_id), episode, mlf_str])
best_mlfs = pd.DataFrame(
    best_mlfs, columns=["job_id", "entropy_coef", "episode", "mlf"])
best_mlfs.head()
The plot below is fairly messy looking if we look at all of the entropy_coef
conditions, so by default we only show the best MLFs proposed by the
entropy_coef = {0.1, 0.2} controllers. We can see that past the
100th episode of training, the entropy_coef=0.1 controller converges
on proposing OneHotEncoder > Imputer > MinMaxScaler > PCA > LogisticRegression
pretty much exclusively, while the entropy_coef=0.2 is still proposing
a wide variety of MLFs.
def create_mlf_timeline(x, y, entropy_coef, visible):
    """Scatter trace of best-MLF architecture (y) over episodes (x) for one
    entropy_coef condition.

    `visible` may be True or "legendonly" (trace hidden until toggled on
    from the legend).
    """
    # Fix: with mode='markers' no line is drawn, so the colour must be set
    # on `marker` -- the original `line=dict(color=...)` had no visible
    # effect and markers fell back to default colours.
    return go.Scatter(
        x=x,
        y=y,
        name=entropy_coef,
        mode='markers',
        visible=visible,
        opacity=0.7,
        marker=dict(color=COLORMAP[entropy_coef])
    )
# Build one marker trace per entropy_coef condition, sorted by coefficient.
# Only the 0.1 and 0.2 conditions are visible by default; the rest can be
# toggled on from the legend.
traces = []
for entropy_coef in sorted(JOB_ENTROPY_COEF_MAP.values()):
    subset = best_mlfs.query("entropy_coef == %s" % entropy_coef)
    visibility = True if entropy_coef in [0.1, 0.2] else "legendonly"
    traces.append(
        create_mlf_timeline(subset.episode, subset.mlf, entropy_coef, visibility))

fig = go.Figure(
    data=traces,
    layout=dict(
        height=600,
        margin=dict(l=600),
        hovermode="closest"
    ))
iplot(fig)
This experiment and previous baseline experiments demonstrate that this RL approach
to AutoML at least works to propose ML frameworks that produce high validation scores
(f1_score, in the experiments so far), and serves as a proof of concept.
At a high level, the next stages of this research are:

Expand the algorithm_components, i.e. the sklearn Transformer and Estimator
classes, so that controllers can propose a wider range of MLFs.